#Resources
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import os
df = pd.read_csv('kaggle_survey_2021_responses.csv')
df.shape
C:\Users\Thinkpad\AppData\Local\Temp\ipykernel_14620\98025904.py:1: DtypeWarning: Columns (0,195,201,285,286,287,288,289,290,291,292) have mixed types. Specify dtype option on import or set low_memory=False.
df = pd.read_csv('kaggle_survey_2021_responses.csv')
(25974, 369)
df.describe()
| Time from Start to Finish (seconds) | Q1 | Q2 | Q3 | Q4 | Q5 | Q6 | Q7_Part_1 | Q7_Part_2 | Q7_Part_3 | ... | Q38_B_Part_3 | Q38_B_Part_4 | Q38_B_Part_5 | Q38_B_Part_6 | Q38_B_Part_7 | Q38_B_Part_8 | Q38_B_Part_9 | Q38_B_Part_10 | Q38_B_Part_11 | Q38_B_OTHER | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 25974 | 25974 | 25974 | 25974 | 25974 | 25974 | 25974 | 21861 | 5335 | 10757 | ... | 634 | 592 | 4240 | 730 | 738 | 1021 | 667 | 2748 | 4543 | 378 |
| unique | 5410 | 12 | 6 | 67 | 8 | 16 | 8 | 2 | 2 | 2 | ... | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 |
| top | 484 | 25-29 | Man | India | Master’s degree | Student | 1-3 years | Python | R | SQL | ... | Comet.ml | Sacred + Omniboard | TensorBoard | Guild.ai | Polyaxon | ClearML | Domino Model Monitor | MLflow | None | Other |
| freq | 42 | 4931 | 20598 | 7434 | 10132 | 6804 | 7874 | 21860 | 5334 | 10756 | ... | 633 | 591 | 4239 | 729 | 737 | 1020 | 666 | 2747 | 4542 | 377 |
4 rows × 369 columns
df.dtypes
Time from Start to Finish (seconds) object
Q1 object
Q2 object
Q3 object
Q4 object
...
Q38_B_Part_8 object
Q38_B_Part_9 object
Q38_B_Part_10 object
Q38_B_Part_11 object
Q38_B_OTHER object
Length: 369, dtype: object
df.head()
| Time from Start to Finish (seconds) | Q1 | Q2 | Q3 | Q4 | Q5 | Q6 | Q7_Part_1 | Q7_Part_2 | Q7_Part_3 | ... | Q38_B_Part_3 | Q38_B_Part_4 | Q38_B_Part_5 | Q38_B_Part_6 | Q38_B_Part_7 | Q38_B_Part_8 | Q38_B_Part_9 | Q38_B_Part_10 | Q38_B_Part_11 | Q38_B_OTHER | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Duration (in seconds) | What is your age (# years)? | What is your gender? - Selected Choice | In which country do you currently reside? | What is the highest level of formal education ... | Select the title most similar to your current ... | For how many years have you been writing code ... | What programming languages do you use on a reg... | What programming languages do you use on a reg... | What programming languages do you use on a reg... | ... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... |
| 1 | 910 | 50-54 | Man | India | Bachelor’s degree | Other | 5-10 years | Python | R | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 784 | 50-54 | Man | Indonesia | Master’s degree | Program/Project Manager | 20+ years | NaN | NaN | SQL | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | None | NaN |
| 3 | 924 | 22-24 | Man | Pakistan | Master’s degree | Software Engineer | 1-3 years | Python | NaN | NaN | ... | NaN | NaN | TensorBoard | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 575 | 45-49 | Man | Mexico | Doctoral degree | Research Scientist | 20+ years | Python | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | None | NaN |
5 rows × 369 columns
# Remove the top row
df = df.iloc[1:, :]
# let's take a look at null values
df.isnull().sum() / df.shape[0]
Time from Start to Finish (seconds) 0.000000
Q1 0.000000
Q2 0.000000
Q3 0.000000
Q4 0.000000
...
Q38_B_Part_8 0.960728
Q38_B_Part_9 0.974358
Q38_B_Part_10 0.894236
Q38_B_Part_11 0.825126
Q38_B_OTHER 0.985485
Length: 369, dtype: float64
I wanna make it easier to run analysis on each question. What if I put all questions in a dictionary along with their answers as a dataframe. This would help me understand each one rather than doing filters each time.
questions = {}
# Key => question number
# Value => dataframe includes all answers (some has multiple answers or parts)
# let's start with keys
keys = list(dict.fromkeys([i.split('_')[0] for i in df.columns])) # Q1_Part_1
# for instance, each question that starts with (Q4) should be put together in the same df
# note: for Q1 if we use (startswith) questions like Q11,Q12, Q13, ... will be put accedently with Q1
# goes the same for Q2 and Q3
for key in keys:
if key in ['Q1','Q2', 'Q3']:
questions[key] = df[key]
else:
questions[key] = df[[q for q in df.columns.values if q.startswith(key)]]
# let's look at kagglers age dist
fig = px.histogram(df, x="Q1")
fig.show()
# look at age VS how many years they've been writing code
# my guiss is older people tend to write more code, let's see
# this time i'll use heatmap
fig = px.density_heatmap(df, x='Q1', y='Q6', category_orders={'Q1': ['18-21','22-24','25-29','30-34','35-39','40-44','45-49','50-54', '55-59','60-69','70+'], 'Q6': ['I have never written code', '<1 year', '1-2 years', '3-5 years', '5-10 years', '10-20 years', '20+ years']})
fig.show()
# Q7 look at programming languages
# Rename columns to be (python, R, SQL, ....) instead of (Q7_Part_1, Q7_Part_2,...)
# so basically I will reanme each columns with the most frequesnt value
questions['Q7'].columns = list(questions['Q7'].mode().values)
# transform it to be each skill and the count
q7 = questions['Q7'].count().reset_index()
# rename columns from (level_0, 0) to (language, count)
q7.columns = ['language', 'count']
# sort by count
q7 = q7.sort_values('count', ascending=False)
fig = px.bar(q7, x='language', y='count')
fig.show()
q = questions['Q7'].copy()
q = q.assign(Roles=df['Q5'])
q.columns
MultiIndex([( 'Python',),
( 'R',),
( 'SQL',),
( 'C',),
( 'C++',),
( 'Java',),
('Javascript',),
( 'Julia',),
( 'Swift',),
( 'Bash',),
( 'MATLAB',),
( 'None',),
( 'Other',),
( 'Roles',)],
)
# I wanna understand everything like (eduaction level, pay, ...) by postion or role (data scientis, analyst,...)
# I will use the same approach to create data snippits for each role
roles = {}
for role in df['Q5'].unique():
roles[role] = df[df['Q5'] == role]
roles.keys()
dict_keys(['Other', 'Program/Project Manager', 'Software Engineer', 'Research Scientist', 'Currently not employed', 'Student', 'Data Scientist', 'Data Analyst', 'Machine Learning Engineer', 'Business Analyst', 'Data Engineer', 'Product Manager', 'Statistician', 'Developer Relations/Advocacy', 'DBA/Database Engineer'])
# How does education level (Q4) vary by role
# let's first look at the whole thing
edu = df['Q4'].value_counts()
fig = px.bar(q7, x=edu.index, y=edu)
fig.show()
# what about data scientis
ds_edu = roles['Data Scientist']['Q4'].value_counts()
fig = px.bar(q7, x=ds_edu.index, y=ds_edu)
fig.show()
# wow, most data scientists tend to get a master degree
# but you still can be a data scientist with bachelor's degree. Thank god
# let's make it for some roles
# have googled on how to filter df using dropdown menu and got it in the answer below
#https://stackoverflow.com/questions/59406167/plotly-how-to-filter-a-pandas-dataframe-using-a-dropdown-menu
fig = go.Figure()
fig.update_layout(
title="Education Level By Role",
xaxis_title="Education Level",
yaxis_title="Count",
)
# set up ONE trace
fig.add_trace(go.Bar(x=edu.index, y=edu.values))
buttons = []
for role in roles.keys():
buttons.append(dict(method='restyle',
label=role,
visible=True,
args=[{'y':[roles[role]['Q4'].value_counts().values],
'x':[roles[role]['Q4'].value_counts().index],
'type':'bar'}, [0]],
))
# # Have no clue what is that and how it works, just wanna something to show
updatemenu = []
your_menu = dict()
updatemenu.append(your_menu)
updatemenu[0]['buttons'] = buttons
updatemenu[0]['direction'] = 'down'
updatemenu[0]['showactive'] = True
# add dropdown menus to the figure
fig.update_layout(showlegend=False, updatemenus=updatemenu)
fig.show()
# makes sense that role like research sientists are most likely to have a doctoral degree
# Data scientists most likely to have a master degree but again a lot don't
# will try to make bars represent percentage instead of row numbers
fig = go.Figure()
fig.update_layout(
title="Comparing Education Level By Role",
xaxis_title="Education Level",
yaxis_title="persent",
)
# set up ONE trace
# let's make y as percent instead of row numbers
fig.add_trace(go.Bar(name='Role', x=edu.index, y=(edu.values/edu.sum())))
buttons = []
for role in roles.keys():
role_edu = roles[role]['Q4'].value_counts()
buttons.append(dict(method='restyle',
label=role,
visible=True,
args=[{'y':[role_edu.values/(role_edu.sum())],
'x':[role_edu.index],
'type':'bar'}, [0]],
))
# # Have no clue what is that and how it works, just wanna something to show
updatemenu = []
your_menu = dict()
updatemenu.append(your_menu)
updatemenu[0]['buttons'] = buttons
updatemenu[0]['direction'] = 'down'
updatemenu[0]['showactive'] = True
# add dropdown menus to the figure
fig.update_layout(showlegend=False, updatemenus=updatemenu)
fig.show()
# it turns out the around 50% of data scientists have master degree
# Let's try to compare edu by roles in the same graph
# let's create two dropdowns
fig = go.Figure()
fig.update_layout(
title="Comparing Education Level By Role",
xaxis_title="Education Level",
yaxis_title="persent",
)
# set up ONE trace
# let's make y as percent instead of row numbers
fig.add_trace(go.Bar(name="1st selection", x=edu.index, y=(edu.values/edu.sum())))
# it figures out we can create two button lists
buttons1 = []
buttons2 = []
for role in roles.keys():
role_edu = roles[role]['Q4'].value_counts()
buttons1.append(dict(method='restyle',
label=role,
visible=True,
args=[{'y':[role_edu.values/(role_edu.sum())],
'x':[role_edu.index],
'type':'bar'}, [0]],
))
# it figures out we can add one more trace and buttons to display two roles at a time
fig.add_trace(go.Bar(name='2nd selection', x=edu.index, y=(edu.values/edu.sum())))
buttons2 = []
for role in roles.keys():
role_edu = roles[role]['Q4'].value_counts()
buttons2.append(dict(method='restyle',
label=role,
visible=True,
args=[{'y':[role_edu.values/(role_edu.sum())],
'x':[role_edu.index],
'type':'bar'}, [1]], # just played around this index (0,1) and it figures
# 0 for 1st trace and 1 for the second, not sure but it works
))
# I've got this now, we can use it to customize each button
#https://plotly.com/python/reference/layout/updatemenus/
updatemenu = [
dict(buttons=buttons1,
direction='down',
pad={'r': 10, 't': 10, 'b': 10},
showactive=True,
x=0.1,
xanchor='left',
y=1.23,
yanchor='top'
),
dict(buttons=buttons2,
direction='down',
pad={'r': 10, 't': 10},
showactive=True,
x=0.5,
xanchor='left',
y=1.23,
yanchor='top'
)
]
# add dropdown menus to the figure
fig.update_layout(updatemenus=updatemenu)
fig.update_layout(annotations = [
dict(text = '1st selection', x = 0, xref='paper', y=1.15, yref='paper', align='left', showarrow=False),
dict(text = '2nd selection', x = 0.45, xref='paper', y=1.15, yref='paper', align='right', showarrow=False)
])
fig.show()
# let's look at what programming language kagglers recommend by role
recommended_languages = df['Q8'].value_counts()
fig = go.Figure()
fig.update_layout(
title="Recommended languages By Role",
xaxis_title="Programming Languages",
yaxis_title="persent",
)
# set up ONE trace
# let's make y as percent instead of row numbers
fig.add_trace(go.Bar(name="1st selection", x=recommended_languages.index, y=(recommended_languages.values/recommended_languages.values.sum())))
# it figures out we can create two button lists
buttons1 = []
buttons1.append(dict(method='restyle',
label='All data',
visible=True,
args=[{'y':[recommended_languages.values/(recommended_languages.sum())],
'x':[recommended_languages.index],
'type':'bar'}, [0]],
))
for role in roles.keys():
recommended_languages_byrole = roles[role]['Q8'].value_counts()
buttons1.append(dict(method='restyle',
label=role,
visible=True,
args=[{'y':[recommended_languages_byrole.values/(recommended_languages_byrole.sum())],
'x':[recommended_languages_byrole.index],
'type':'bar'}, [0]],
))
# it figures out we can add one more trace and buttons to display two roles at a time
fig.add_trace(go.Bar(name='2nd selection', x=recommended_languages.index, y=(recommended_languages.values/recommended_languages.values.sum())))
buttons2 = []
buttons2.append(dict(method='restyle',
label='All data',
visible=True,
args=[{'y':[recommended_languages.values/(recommended_languages.sum())],
'x':[recommended_languages.index],
'type':'bar'}, [1]],
))
for role in roles.keys():
recommended_languages_byrole = roles[role]['Q8'].value_counts()
buttons2.append(dict(method='restyle',
label=role,
visible=True,
args=[{'y':[recommended_languages_byrole.values/(recommended_languages_byrole.sum())],
'x':[recommended_languages_byrole.index],
'type':'bar'}, [1]], # just played around this index (0,1) and it figures
# 0 for 1st trace and 1 for the second, not sure but it works
))
# I've got this now, we can use it to customize each button
#https://plotly.com/python/reference/layout/updatemenus/
updatemenu = [
dict(buttons=buttons1,
direction='down',
pad={'r': 10, 't': 10, 'b': 10},
showactive=True,
x=0.1,
xanchor='left',
y=1.15,
yanchor='top'
),
dict(buttons=buttons2,
direction='down',
pad={'r': 10, 't': 10},
showactive=True,
x=0.5,
xanchor='left',
y=1.15,
yanchor='top'
)
]
# add dropdown menus to the figure
fig.update_layout(updatemenus=updatemenu)
fig.update_layout(annotations = [
dict(text = '1st selection', x = 0, xref='paper', y=1.15, yref='paper', align='left', showarrow=False),
dict(text = '2nd selection', x = 0.45, xref='paper', y=1.15, yref='paper', align='right', showarrow=False)
])
fig.show()
# wow. About 80% recommedned Python
# evnen statisticians who use R most commonly, still recommend python though
# if you coming in and trying to understand what should i learn first, pyhton is the number 1 thing
# so good so far, but I wanna make a kind function that can help us generate those graphs
def filter_bars(role, data):
df = data[data['Roles'] == role]
q = df.drop('Roles', axis= 1).count().reset_index()
q.columns = ['feature','Count']
return (q.feature, q.Count/q.Count.sum())
def build_graph(question, Roles, Title):
"""
first case: questions that have multiple columns
I'll handle it as the same as Q7 above
"""
if isinstance(question, pd.DataFrame):
q_copy = question.copy()
q_copy.columns = list(q_copy.mode().iloc[0,:])
q_feature_count = q_copy.count().reset_index()
q_feature_count.columns = ['feature','Count']
q_feature_count = q_feature_count.sort_values('Count', ascending = False)
q_copy['Roles'] = df.Q5
fig = go.Figure(layout=go.Layout(title= go.layout.Title(text=Title)))
#changed from role selection to selection 1
fig.add_trace(go.Bar(name= 'Selection 1', x= q_feature_count.feature, y=(q_feature_count.Count/ q_feature_count.Count.sum())))
buttons = []
#added button for all data comparison
buttons.append(dict(method='restyle',
label= 'All Samples',
visible=True,
args=[{'y':[(q_feature_count.Count/ q_feature_count.Count.sum())],
'x':[q_feature_count.feature],
'type':'bar'}, [0]], # the [0] at the end lets us know they are for the first trace
)
)
for i in list(Roles.keys())[1:]:
buttons.append(dict(method='restyle',
label= i,
visible=True,
args=[{'y':[filter_bars(i,q_copy)[1].values],
'x':[filter_bars(i,q_copy)[0].values],
'type':'bar'}, [0]], # the [0] at the end lets us know they are for the first trace
)
)
fig.add_trace(go.Bar(name= 'Selection 2', x= q_feature_count.feature, y=(q_feature_count.Count/ q_feature_count.Count.sum())))
buttons_2 = []
#added button for all data comparison
buttons_2.append(dict(method='restyle',
label= 'All Samples',
visible=True,
args=[{'y':[(q_feature_count.Count/ q_feature_count.Count.sum())],
'x':[q_feature_count.feature],
'type':'bar'}, [1]], # the [0] at the end lets us know they are for the first trace
)
)
for i in list(Roles.keys())[1:]:
buttons_2.append(dict(method='restyle',
label= i,
visible=True,
args=[{'y':[filter_bars(i,q_copy)[1].values],
'x':[filter_bars(i,q_copy)[0].values],
'type':'bar'}, [1]], # the [0] at the end lets us know they are for the first trace
)
)
button_layer_1_height = 1.15
updatemenus = list([
dict(buttons=buttons,
direction="down",
pad={"r": 10, "t": 10},
showactive=True,
x=0.1,
xanchor="left",
y=button_layer_1_height,
yanchor="top"),
dict(buttons=buttons_2,
direction="down",
pad={"r": 10, "t": 10},
showactive=True,
x=0.50,
xanchor="left",
y=button_layer_1_height,
yanchor="top")
])
fig.update_layout( updatemenus=updatemenus)
#added annotations next to dropdowns
fig.update_layout(
annotations=[
dict(text="Selection 1", x=0, xref="paper", y=1.1, yref="paper",
align="left", showarrow=False),
dict(text="Selection 2", x=0.45, xref="paper", y=1.1,
yref="paper", showarrow=False)
])
fig.update_xaxes(categoryorder= 'array', categoryarray= q_feature_count.feature)
fig.show()
else:
print('else')
"""
2nd case is questions that have only one column like Q2 and Q8 above
"""
qnumber= question.copy()
vcnts = qnumber.value_counts()
qnumber = pd.concat([qnumber,df.Q5], axis =1)
qnumber.columns = ['feature','Roles']
fig = go.Figure(layout=go.Layout(title= go.layout.Title(text=Title)))
#changed from role selection to selection 1
fig.add_trace(go.Bar(name= 'Selection 1', x= vcnts.index, y=(vcnts.values/ vcnts.values.sum())))
buttons = []
#added button for all data comparison
buttons.append(dict(method='restyle',
label= 'All Samples',
visible=True,
args=[{'y':[vcnts.values/ vcnts.values.sum()],
'x':[vcnts.index],
'type':'bar'}, [0]], # the [0] at the end lets us know they are for the first trace
)
)
for i in list(Roles.keys())[1:]:
qrole = qnumber[qnumber['Roles']==i].feature.value_counts()
buttons.append(dict(method='restyle',
label= i,
visible=True,
args=[{'y':[qrole.values/qrole.values.sum()],
'x':[qrole.index],
'type':'bar'}, [0]], # the [0] at the end lets us know they are for the first trace
)
)
fig.add_trace(go.Bar(name= 'Selection 1', x= vcnts.index, y=(vcnts.values/ vcnts.values.sum())))
buttons_2 = []
#added button for all data comparison
buttons_2.append(dict(method='restyle',
label= 'All Samples',
visible=True,
args=[{'y':[vcnts.values/ vcnts.values.sum()],
'x':[vcnts.index],
'type':'bar'}, [1]], # the [0] at the end lets us know they are for the first trace
)
)
for i in list(Roles.keys())[1:]:
qrole = qnumber[qnumber['Roles']==i].feature.value_counts()
buttons_2.append(dict(method='restyle',
label= i,
visible=True,
args=[{'y':[qrole.values/qrole.values.sum()],
'x':[qrole.index],
'type':'bar'}, [1]], # the [0] at the end lets us know they are for the first trace
)
)
button_layer_1_height = 1.15
updatemenus = list([
dict(buttons=buttons,
direction="down",
pad={"r": 10, "t": 10},
showactive=True,
x=0.1,
xanchor="left",
y=button_layer_1_height,
yanchor="top"),
dict(buttons=buttons_2,
direction="down",
pad={"r": 10, "t": 10},
showactive=True,
x=0.50,
xanchor="left",
y=button_layer_1_height,
yanchor="top")
])
fig.update_layout( updatemenus=updatemenus)
#added annotations next to dropdowns
fig.update_layout(
annotations=[
dict(text="Selection 1", x=0, xref="paper", y=1.1, yref="paper",
align="left", showarrow=False),
dict(text="Selection 2", x=0.45, xref="paper", y=1.1,
yref="paper", showarrow=False)
])
fig.update_xaxes(categoryorder= 'array', categoryarray= vcnts.index)
fig.show()
return
# maybe it's not the most elegant way to do it but that's what i've come up with. I might revist it later
build_graph(questions['Q7'], roles, 'Popular Programming Language')
# Python, SQL, and R are the most popular ones
# All people tend to use pyhton much more than R except for Statisticians :)
# seems like analysts are using SQL a bit more than data scientists
build_graph(df['Q8'], roles, 'Recommended language')
else
build_graph(questions['Q9'], roles, 'IDE')
build_graph(questions['Q14'], roles, 'most common data viz library')
build_graph(questions['Q31'], roles, 'most common PI softwares')
build_graph(questions['Q38'], roles, 'Primary tool used in analytics')